Part 2: Visualize topic-difference

In [1]:
import logging
# Timestamped INFO-level logging on the root logger, so progress messages
# from the model-loading / evaluation calls below are visible in the notebook.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info("check")
2017-04-01 23:14:50,939 : INFO : check

Sort models by train process (epoch, doc_count)

In [2]:
import glob
import os


def cmp_key(s):
    """Sort key for a checkpoint path like 'models/lda/ep2_docs40000_lda.model'.

    Returns (epoch, doc_count) as integers so checkpoints order by training
    progress rather than lexicographically.
    """
    epoch_str, docs_str, _suffix = os.path.basename(s).split("_")
    epoch = int(epoch_str.replace("ep", ""))
    # Some filenames use the singular "doc" prefix instead of "docs".
    docs = int(docs_str.replace("docs", "").replace("doc", ""))
    return (epoch, docs)

# Keep only finished checkpoints (exactly one "." in the name) and skip the
# 250000-doc runs, ordered by (epoch, doc_count).
models = sorted(filter(lambda s: s.count(".") == 1 and "250000" not in s, glob.glob("models/lda/*")), key=cmp_key)
# BUG FIX: materialize as a list. Under Python 3, zip() returns a one-shot
# iterator, so displaying it here would exhaust it and the later indexing
# (keyed_models[4][1], keyed_models[12][1]) would fail.
keyed_models = list(zip(map(cmp_key, models), models))
keyed_models
Out[2]:
[((1, 20000), 'models/lda/ep1_docs20000_lda.model'),
 ((1, 40000), 'models/lda/ep1_docs40000_lda.model'),
 ((1, 60000), 'models/lda/ep1_docs60000_lda.model'),
 ((1, 80000), 'models/lda/ep1_docs80000_lda.model'),
 ((1, 100000), 'models/lda/ep1_docs100000_lda.model'),
 ((2, 20000), 'models/lda/ep2_docs20000_lda.model'),
 ((2, 40000), 'models/lda/ep2_docs40000_lda.model'),
 ((2, 60000), 'models/lda/ep2_docs60000_lda.model'),
 ((2, 80000), 'models/lda/ep2_docs80000_lda.model'),
 ((2, 100000), 'models/lda/ep2_docs100000_lda.model'),
 ((3, 20000), 'models/lda/ep3_docs20000_lda.model'),
 ((3, 40000), 'models/lda/ep3_docs40000_lda.model'),
 ((3, 60000), 'models/lda/ep3_docs60000_lda.model'),
 ((3, 80000), 'models/lda/ep3_docs80000_lda.model'),
 ((3, 100000), 'models/lda/ep3_docs100000_lda.model')]

Function to draw a topic-to-topic heatmap

In [3]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode()

def draw_heatmap(z, title="", x_title="", y_title="", text=None):
    """Render a heatmap of matrix `z` with optional per-cell hover text.

    Parameters
    ----------
    z : 2-D array-like of difference values.
    title, x_title, y_title : plot and axis titles.
    text : optional nested list of hover annotations, same shape as `z`.

    BUG FIX: the default was a mutable `text=[]` (shared across calls, a
    classic Python pitfall); `None` is used as the sentinel instead.
    """
    data = [
        go.Heatmap(
            z=z,
            colorscale='RdBu',
            text=text if text is not None else []
        )
    ]

    layout = go.Layout(
        width=950,
        height=950,
        title=title,
        xaxis=dict(title=x_title),
        yaxis=dict(title=y_title))
    py.iplot(go.Figure(data=data, layout=layout))

Case 1: Difference between TWO models (Topic2Topic)

P.S. We are not limited to LDA. All that is required is the matrix $\Phi$ $(Topics \times Dictionary)$, plus the $Dictionary$ itself (optional — needed for the Jaccard distance and the annotations).

Difference with KL / Hellinger distance

In [4]:
import numpy as np
from gensim.models import LdaMulticore
from gensim.matutils import kullback_leibler, hellinger

def topic2topic_diff_probs(m1, m2, distance="kulback_leibler"):
    """Pairwise topic-to-topic distance matrix between two LDA models.

    Parameters
    ----------
    m1, m2 : trained gensim LDA models (anything exposing state.get_lambda()).
    distance : "kulback_leibler" (sic, key kept for compatibility) or "hellinger".

    Returns
    -------
    (t1_size x t2_size) numpy array, normalized by its maximum value.
    """
    distances = {"kulback_leibler": kullback_leibler,
                 "hellinger": hellinger}
    assert distance in distances, "Incorrect distance, valid only {}".format(", ".join(distances.keys()))

    distance_func = distances[distance]
    d1, d2 = m1.state.get_lambda(), m2.state.get_lambda()
    t1_size, t2_size = d1.shape[0], d2.shape[0]

    z = np.zeros((t1_size, t2_size))

    for topic1 in range(t1_size):
        for topic2 in range(t2_size):
            # BUG FIX: the previous version skipped topic2 < topic1 and
            # mirrored z[topic2][topic1] = z[topic1][topic2]. That assumes
            # dist(m1_i, m2_j) == dist(m1_j, m2_i), which is false for two
            # different models (and KL is asymmetric even for one), and it
            # indexed out of bounds when the models' topic counts differ.
            z[topic1][topic2] = distance_func(d1[topic1], d2[topic2])

    z_max = np.max(z)
    # Guard against an all-zero matrix (identical topics) to avoid NaNs.
    return z / z_max if z_max > 0 else z
2017-04-01 23:14:54,530 : INFO : 'pattern' package not found; tag filters are not available for English

Difference with Jaccard distance (use top words from each topic)

In [5]:
from gensim.models import LdaMulticore
import numpy as np

def jaccard(s1, s2):
    """Jaccard distance between two sets: 1 - |s1 & s2| / |s1 | s2|.

    BUG FIX: two empty sets are treated as identical (distance 0.0)
    instead of raising ZeroDivisionError.
    """
    union = s1 | s2
    if not union:
        return 0.0
    return 1. - float(len(s1 & s2)) / float(len(union))


def topic2topic_diff_jcd(m1, m2, num_words=100):
    """Pairwise Jaccard-distance matrix over the top `num_words` tokens of
    every topic pair (topic of m1, topic of m2).

    Returns a (t1_size x t2_size) numpy array.
    """
    t1_size = m1.state.get_lambda().shape[0]
    t2_size = m2.state.get_lambda().shape[0]

    fst_topics = [{w for (w, _) in m1.show_topic(topic, topn=num_words)} for topic in range(t1_size)]
    snd_topics = [{w for (w, _) in m2.show_topic(topic, topn=num_words)} for topic in range(t2_size)]

    z = np.zeros((t1_size, t2_size))

    for topic1 in range(t1_size):
        for topic2 in range(t2_size):
            # BUG FIX: the old mirror write z[topic2][topic1] = z[topic1][topic2]
            # assumed jaccard(m1_i, m2_j) == jaccard(m1_j, m2_i) — false for two
            # different models — and went out of bounds when t1_size != t2_size.
            # Every cell is now computed directly.
            z[topic1][topic2] = jaccard(fst_topics[topic1], snd_topics[topic2])

    return z

WordsDiff (annotation for heatmap cells)

In [6]:
from gensim.models import LdaMulticore
from random import sample
import numpy as np


def topic2topic_text(m1, m2, num_words=100, topw=10):
    """Build per-cell hover annotations for a topic2topic heatmap.

    For each pair (topic of m1, topic of m2), shows up to `topw` shared
    tokens ("+++") and up to `topw` tokens found on only one side ("---"),
    drawn from each topic's top `num_words` tokens.

    Returns a (t1_size x t2_size) nested list of strings — the same shape
    as the difference matrix it annotates.
    """
    t1_size = m1.state.get_lambda().shape[0]
    t2_size = m2.state.get_lambda().shape[0]

    # BUG FIX: the dimensions were swapped (t2_size rows x t1_size columns)
    # while cells are addressed as txt[topic1][topic2]; with models of
    # different topic counts this crashed or misaligned the annotations.
    txt = [["" for _ in range(t2_size)] for _ in range(t1_size)]

    fst_topics = [{w for (w, _) in m1.show_topic(topic, topn=num_words)} for topic in range(t1_size)]
    snd_topics = [{w for (w, _) in m2.show_topic(topic, topn=num_words)} for topic in range(t2_size)]

    for topic1 in range(t1_size):
        for topic2 in range(t2_size):
            pos_tokens = fst_topics[topic1] & snd_topics[topic2]
            neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2])

            # BUG FIX: random.sample() rejects sets since Python 3.11; sort
            # into a list first (also reproducible under a seeded RNG).
            pos_tokens = sample(sorted(pos_tokens), min(len(pos_tokens), topw))
            neg_tokens = sample(sorted(neg_tokens), min(len(neg_tokens), topw))

            # BUG FIX: the old mirror write txt[topic2][topic1] = res assumed a
            # square symmetric layout; every cell is now filled directly.
            txt[topic1][topic2] = "+++ {}<br>--- {}".format(", ".join(pos_tokens), ", ".join(neg_tokens))

    return txt

Construct topic-to-topic matrices

In [ ]:
# Pick two checkpoints to compare; given the sorted listing above, index 4 is
# (epoch 1, 100k docs) and index 12 is (epoch 3, 60k docs).
# NOTE(review): keyed_models must be a real list (not a Python 3 zip iterator)
# for this indexing to work — confirm the construction cell above.
p1, p2 = keyed_models[4][1], keyed_models[12][1] # choose two models to compare
m1, m2 = LdaMulticore.load(p1), LdaMulticore.load(p2)

%time z_jcd = topic2topic_diff_jcd(m1, m2)
%time z_kl = topic2topic_diff_probs(m1, m2, distance="kulback_leibler")
%time z_hellinger = topic2topic_diff_probs(m1, m2, distance="hellinger")

%time annotation_text = topic2topic_text(m1, m2)

Plot topic2topic difference matrix with different topic-similarity metrics

In [8]:
# One heatmap per topic-similarity metric, all sharing the same hover annotations.
for diff_matrix, metric in [(z_jcd, "Jaccard"), (z_kl, "KL"), (z_hellinger, "Hellinger")]:
    draw_heatmap(diff_matrix,
                 title="Topic difference [{}]".format(metric),
                 x_title="Topic", y_title="Topic", text=annotation_text)

Case 2: Difference between sequential models one-by-one topics

Difference with KL / Hellinger distance

In [9]:
def smodel_diff_kl(models_pths, distance="kulback_leibler", num_words=100):
    """Per-topic distance between each pair of consecutive checkpoints.

    Parameters
    ----------
    models_pths : checkpoint paths ordered by training progress.
    distance : "kulback_leibler" (sic, key kept for compatibility) or "hellinger".
    num_words : unused; kept so the signature matches smodel_diff_jcd.

    Returns a (num_topics x len(models_pths) - 1) array normalized by its max.
    """
    distances = {"kulback_leibler": kullback_leibler,
                 "hellinger": hellinger}
    assert distance in distances, "Incorrect distance, valid only {}".format(", ".join(distances.keys()))

    distance_func = distances[distance]

    rows = []
    for prev_path, next_path in zip(models_pths, models_pths[1:]):
        prev_lambda = LdaMulticore.load(prev_path).state.get_lambda()
        next_lambda = LdaMulticore.load(next_path).state.get_lambda()
        # Consecutive checkpoints must share a topic count for 1:1 comparison.
        assert prev_lambda.shape[0] == next_lambda.shape[0]

        rows.append([distance_func(prev_lambda[t], next_lambda[t])
                     for t in range(prev_lambda.shape[0])])

    return (rows / np.max(rows)).T

Difference with Jaccard distance

In [10]:
def smodel_diff_jcd(models_pths, num_words=100):
    """Jaccard distance of each topic's top `num_words` tokens between
    consecutive checkpoints.

    Returns a (num_topics x len(models_pths) - 1) numpy array.
    """
    columns = []

    for prev_path, next_path in zip(models_pths, models_pths[1:]):
        prev_model = LdaMulticore.load(prev_path)
        next_model = LdaMulticore.load(next_path)

        topic_num = prev_model.state.get_lambda().shape[0]
        # Consecutive checkpoints must share a topic count for 1:1 comparison.
        assert topic_num == next_model.state.get_lambda().shape[0]

        columns.append([
            jaccard({w for (w, _) in prev_model.show_topic(t, topn=num_words)},
                    {w for (w, _) in next_model.show_topic(t, topn=num_words)})
            for t in range(topic_num)
        ])

    return np.array(columns).T
In [11]:
def topic_cov(z_diffs):
    """Mean normalized difference per column (the topic-convergence curve).

    Normalizes by the global maximum and averages over rows.

    BUG FIX: the previous version divided the caller's array in place
    (`z_diffs /= np.max(z_diffs)`), silently mutating the difference
    matrices that other cells still use; it now works on a copy.
    """
    z = np.asarray(z_diffs, dtype=float) / np.max(z_diffs)
    return np.sum(z, axis=0) / z.shape[0]

Construct one-by-one topic matrices

In [ ]:
# Ordered list of all checkpoint paths, dropping the (epoch, doc_count) keys.
# NOTE(review): `_` as a loop variable shadows IPython's last-output shorthand;
# it works here but is fragile in interactive use.
pths = [_[1] for _ in keyed_models]

%time z_diffs_jcd = smodel_diff_jcd(pths)
%time z_diffs_kl = smodel_diff_kl(pths, distance="kulback_leibler")
%time z_diffs_hr = smodel_diff_kl(pths, distance="hellinger")

Plot one-by-one topic difference matrices with different topic-similarity metrics

In [13]:
# One between-update heatmap per metric: rows are topics, columns are update steps.
for diff_matrix, metric in [(z_diffs_jcd, "Jaccard"), (z_diffs_kl, "KL"), (z_diffs_hr, "Hellinger")]:
    draw_heatmap(diff_matrix,
                 title="Topic diff between updates [{}]".format(metric),
                 x_title="Epoch diff", y_title="Topics")

Calculate perplexity & coherence on holdout for comparison

In [14]:
import json
from gensim.models.coherencemodel import CoherenceModel


def calc_perplexity(model_pths, holdout_path):
    """Perplexity of each saved model on the holdout corpus.

    `holdout_path` is a JSON-lines file whose records carry a "d2b"
    (document-to-bow) field. Returns one float per model path.
    """
    with open(holdout_path) as infile:
        holdout = [json.loads(line)["d2b"] for line in infile]

    scores = []
    total = len(model_pths)
    for idx, path in enumerate(model_pths, start=1):
        logging.info("Model %d of %d", idx, total)

        model = LdaMulticore.load(path)
        # log_perplexity returns a per-word bound; perplexity = 2^(-bound).
        scores.append(float(np.exp2(-model.log_perplexity(holdout))))

    return scores


def calc_coherence(model_pths, holdout_path):
    """u_mass topic coherence of each saved model on the holdout corpus.

    `holdout_path` is a JSON-lines file whose records carry a "d2b"
    (document-to-bow) field. Returns one float per model path.
    """
    with open(holdout_path) as infile:
        holdout = [json.loads(line)["d2b"] for line in infile]

    scores = []
    total = len(model_pths)
    for idx, path in enumerate(model_pths, start=1):
        logging.info("Model %d of %d", idx, total)

        model = LdaMulticore.load(path)
        cm = CoherenceModel(model=model, corpus=holdout, coherence='u_mass')
        scores.append(float(cm.get_coherence()))

    return scores
In [15]:
import os

# Ensure the cache directory exists (no-op if it already does).
!mkdir -p cache/

# use cache if already calculated
if not os.path.isfile("cache/perplexity.txt"):
    %time perplexity = calc_perplexity(pths, "dataset/holdout.json")
    
    # Persist one value per line so re-runs can skip the expensive pass.
    with open("cache/perplexity.txt", 'w') as outfile:
        for perpl in perplexity:
            outfile.write("{}\n".format(perpl))
            
# Reload from the cache so `perplexity` is defined on both code paths.
with open("cache/perplexity.txt") as infile:
    perplexity = [float(line.strip()) for line in infile]
In [16]:
# Ensure the cache directory exists (no-op if it already does).
!mkdir -p cache/

# use cache if already calculated
if not os.path.isfile("cache/coherence.txt"):
    %time coherence = calc_coherence(pths, "dataset/holdout.json")
    
    # Persist one value per line so re-runs can skip the expensive pass.
    with open("cache/coherence.txt", 'w') as outfile:
        for coh in coherence:
            outfile.write("{}\n".format(coh))
            
# Reload from the cache so `coherence` is defined on both code paths.
with open("cache/coherence.txt") as infile:
    coherence = [float(line.strip()) for line in infile]

Calculate pairwise means of consecutive values so the X axis is consistent with the other plots

In [17]:
# Pairwise means of consecutive checkpoints, so these series have length
# len(values) - 1 and align with the between-update matrices plotted above.
# BUG FIX: materialize with list() — under Python 3, a bare map() returns a
# one-shot iterator that is exhausted (or rejected) when plotly builds the
# scatter traces below.
perplexity_pr = list(map(np.mean, zip(perplexity, perplexity[1:])))
coherence_pr = list(map(np.mean, zip(coherence, coherence[1:])))

Plot the column-mean convergence curves for each distance variant, plus perplexity and u_mass coherence

In [18]:
# Convergence curves: mean normalized per-topic difference at each update
# step, one trace per distance metric.
data = [go.Scatter(y=topic_cov(z_diffs_jcd), name="sum(jaccard)"),
        go.Scatter(y=topic_cov(z_diffs_kl), name="sum(KL)"),
        go.Scatter(y=topic_cov(z_diffs_hr), name="sum(Hellinger)")]

layout = go.Layout(
    title="Topic convergence",
    xaxis=dict(title="Epoch diff")
)


py.iplot(go.Figure(data=data, layout=layout))

# NOTE(review): this trace is labeled "log(perplexity)" but `perplexity_pr`
# holds raw perplexity values from calc_perplexity — confirm the intended label.
py.iplot(go.Figure(data=[go.Scatter(y=perplexity_pr, 
                                    name="log(perplexity)")], 
                   layout=go.Layout(title="Perplexity")))

py.iplot(go.Figure(data=[go.Scatter(y=coherence_pr, 
                                    name="Coherence u_mass")], 
                   layout=go.Layout(title="Topic coherence")))